// Source Code of org.terrier.structures.indexing.InvertedIndexBuilder

/*
 * Terrier - Terabyte Retriever
 * Webpage: http://terrier.org 
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is InvertedIndexBuilder.java.
 *
 * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
 *   Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk>
 */
package org.terrier.structures.indexing;


import gnu.trove.TIntArrayList;
import gnu.trove.TIntIntHashMap;


import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Map;


import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;
import org.terrier.compression.BitIn;
import org.terrier.compression.BitOut;
import org.terrier.compression.BitOutputStream;
import org.terrier.structures.BitIndexPointer;
import org.terrier.structures.DirectIndexInputStream;
import org.terrier.structures.FSOMapFileLexicon;
import org.terrier.structures.FSOMapFileLexiconOutputStream;
import org.terrier.structures.Index;
import org.terrier.structures.IndexUtil;
import org.terrier.structures.LexiconEntry;
import org.terrier.structures.LexiconOutputStream;
import org.terrier.structures.SimpleBitIndexPointer;
import org.terrier.structures.postings.BasicIterablePosting;
import org.terrier.structures.postings.FieldIterablePosting;
import org.terrier.structures.seralization.FixedSizeWriteableFactory;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.FieldScore;
import org.terrier.utility.Files;
import org.terrier.utility.Rounding;
/**
 * Builds an inverted index. It optionally saves term-field information as well. 
 * <p><b>Algorithm:</b>
 * <ol>
 * <li>While there are terms left:
 *  <ol>
 *  <li>Read M term ids from lexicon, in lexicographical order</li>
 *  <li>Read the occurrences of these M terms into memory from the direct file</li>
 *  <li>Write the occurrences of these M terms to the inverted file</li>
 *  </ol>
 * <li>Rewrite the lexicon, removing block frequencies, and adding inverted file offsets</li>
 * <li>Write the collection statistics</li>
 * </ol>
 * <p><b>Lexicon term selection:</b>
 * There are two strategies for selecting the number of terms to read from the lexicon. The trade-off here
 * is to read a small enough number of terms into memory such that the occurrences of all those terms from
 * the direct file can fit in memory. On the other hand, reading fewer terms per iteration implies more iterations,
 * which is I/O expensive, as the entire direct file has to be read for every iteration.<br>
 * The two strategies are:
 * <ul>
 * <li>Read a fixed number of terms on each iteration - this corresponds to the property
 *  <tt>invertedfile.processterms</tt></li>
 * <li>Read a fixed number of occurrences (pointers) on each iteration. The number of pointers can be determined
 *  using the sum of frequencies of each term from the lexicon. This corresponds to the property
 *  <tt>invertedfile.processpointers</tt>. 
 * </li></ul>
 * By default, the 2nd strategy is chosen, unless the <tt>invertedfile.processpointers</tt> has a zero
 * value specified.<P>
 * Properties:
 * <ul>
 *  <li><tt>invertedfile.processterms</tt>- the number of terms to process in each iteration. Defaults to 75,000</li>
 *  <li><tt>invertedfile.processpointers</tt> - the number of pointers to process in each iteration. Defaults to 20,000,000</li>
 * </ul>
 * @author Craig Macdonald &amp; Vassilis Plachouras
  */
public class InvertedIndexBuilder {


  /** Class to be used as the lexicon output stream. Set by this class (the constructor
    * assigns <tt>LexiconOutputStream.class</tt>) and by child classes. */
  protected Class<?> lexiconOutputStream = null;




  /** The logger used by this class. */
  protected static final Logger logger = Logger.getLogger(InvertedIndexBuilder.class);
  
  protected static class IntLongTuple
  {
    final int Terms;
    final long Pointers;
    IntLongTuple(int a, long b)
    {
      Terms = a;
      Pointers = b;
    }
  }
  
  /** Number of fields stored with each posting. createInvertedIndex() sets it from the
    * index property <tt>index.direct.fields.count</tt>; each term gets 2 + fieldCount
    * temporary posting arrays. */
  protected int fieldCount = 0;


  /** Indicates whether field information is used. Set to <tt>fieldCount &gt; 0</tt>
    * by createInvertedIndex(). */
  protected boolean useFieldInformation;
  
  /** The index whose lexicon and direct index are read, and on which the resulting
    * inverted index structure is registered. */
  protected Index index = null;
  
  /** Name of the structure being built; it forms part of the output filename and is the
    * name under which the structure is registered on the index. */
  protected String structureName = null;
  
  /** The number of pointers to be processed in an iteration. This directly corresponds to the
    * property <tt>invertedfile.processpointers</tt>. If this property is set and > 0, then each
    * iteration of the inverted index creation will be done to a set number of pointers, not a set
    * number of terms, overriding <tt>invertedfile.processterms</tt>. Default is 20000000. */
  protected long numberOfPointersPerIteration = Long.parseLong(
    ApplicationSetup.getProperty("invertedfile.processpointers", "20000000"));
  
  /**
   * The underlying bit file that the postings are written to. It is opened by the
   * constructor and stays <tt>null</tt> if opening it failed.
   */
  protected BitOut file;


  /**
   * contructor
   * @param i
   * @param _structureName
   */
  public InvertedIndexBuilder(Index i, String _structureName)
  {
    this.index = i;
    this.structureName = _structureName;
    
    try{
      file = new BitOutputStream(index.getPath() + "/"+ index.getPrefix() + "." +structureName + BitIn.USUAL_EXTENSION);
    } catch (IOException ioe) {
      logger.error("creating BitOutputStream for writing the inverted file : ", ioe);
    }
    lexiconOutputStream = LexiconOutputStream.class;
  }






  /**
   * Closes the underlying bit file.
   */
  public void close() throws IOException {
    file.close();
  }


  /**
   * Creates the inverted index using the already created direct index,
   * document index and lexicon.
   */
  @SuppressWarnings("unchecked")
  public void createInvertedIndex() {
    try {
      Runtime r = Runtime.getRuntime();
      logger.debug("creating inverted index");
      final String LexiconFilename = index.getPath() + "/" + index.getPrefix() + ".lexicon";
      
      //final int _numberOfDocuments = index.getCollectionStatistics().getNumberOfDocuments();
    
      long assumedNumberOfPointers = Long.parseLong(index.getIndexProperty("num.Pointers", "0"));        
      long _numberOfTokens = 0;
      long _numberOfPointers = 0;
      int _numberOfUniqueTerms = index.getCollectionStatistics().getNumberOfUniqueTerms();
      
      fieldCount = index.getIntIndexProperty("index.direct.fields.count", 0);
      this.useFieldInformation = fieldCount > 0;
      Iterator<Map.Entry<String,LexiconEntry>> lexiconStream = 
        (Iterator<Map.Entry<String,LexiconEntry>>)index.getIndexStructureInputStream("lexicon");
    
      //A temporary file for storing the updated lexicon file, after
      // creating the inverted file
      DataOutputStream dos = new DataOutputStream(Files.writeFileStream(LexiconFilename.concat(".tmp2")));


      //if the set number of terms to process is higher than the
      // available,
      if (processTerms > _numberOfUniqueTerms)
        processTerms = (int) _numberOfUniqueTerms;
      long startProcessingLexicon = 0;
      long startTraversingDirectFile = 0;
      long startWritingInvertedFile = 0;
      long numberOfPointersThisIteration = 0;
      
      int i=0; int iterationCounter = 0;
      // generate a message guessing iteration counts
      String iteration_message_suffix = null;
      if (numberOfPointersPerIteration > 0 || processTerms ==0)
      {
        if (assumedNumberOfPointers > 0)
        {
          iteration_message_suffix = " of "
            + ((assumedNumberOfPointers % numberOfPointersPerIteration ==0 )
              ? (assumedNumberOfPointers/numberOfPointersPerIteration)
              : 1+(assumedNumberOfPointers/numberOfPointersPerIteration))
            + " iterations";
        }
        else
        {
          iteration_message_suffix = "";
        }
      }
      else
      {
        iteration_message_suffix = " of "
          + ((_numberOfUniqueTerms % processTerms ==0 ) 
            ? (_numberOfUniqueTerms/processTerms)
            : 1+(_numberOfUniqueTerms/processTerms))
          + " iterations";
      }


      if (numberOfPointersPerIteration == 0)
      {
        //logger.warn("Using old-fashioned number of terms strategy. Please consider setting invertedfile.processpointers for forward compatible use");
      }
    
      while(i<_numberOfUniqueTerms)
      {
        iterationCounter++;
        TIntIntHashMap codesHashMap = null;
        TIntArrayList[][] tmpStorage = null;
        IntLongTuple results = null;
        
        //logger.info("Iteration "+iterationCounter+iteration_message_suffix);
        
        //traverse the lexicon looking to determine the first N() terms
        //this can be done two ways: for the first X terms
        //OR for the first Y pointers
        //ie either N=X, or N=fn(Y)
        
        startProcessingLexicon = System.currentTimeMillis();
        
        if (numberOfPointersPerIteration > 0)
        {//we've been configured to run with a given number of pointers
          if (logger.isDebugEnabled())
            logger.debug("Scanning lexicon for "+ numberOfPointersPerIteration + " pointers");
        
          /* this is less speed efficient, as we have no way to guess how many
           * terms it will take to fill the given number of pointers. 
           * The advantage is that memory consumption is more directly correlated
           * to number of pointers than number of terms, so when indexing tricky
           * collections, it is easier to find a number of pointers that can fit
           * in memory */
           
          codesHashMap = new TIntIntHashMap();
          ArrayList<TIntArrayList[]> tmpStorageStorage = new ArrayList<TIntArrayList[]>();
          results = scanLexiconForPointers(
            numberOfPointersPerIteration, 
            lexiconStream,
            codesHashMap,
            tmpStorageStorage);
          tmpStorage = (TIntArrayList[][]) tmpStorageStorage.toArray(
            new TIntArrayList[0][0]);
          
        }
        else//we're running with a given number of terms
        {
          if (logger.isDebugEnabled())
            logger.debug("Scanning lexicon for " + processTerms+" terms");
          tmpStorage = new TIntArrayList[processTerms][];
          codesHashMap = new TIntIntHashMap(processTerms);
          results = scanLexiconForTerms(
            processTerms,
            lexiconStream,
            codesHashMap,
            tmpStorage);
        }
        
        processTerms = results.Terms;//no of terms to process on this iteration
        numberOfPointersThisIteration = results.Pointers;
        _numberOfPointers += results.Pointers;//no of pointers to process on this iteration
        logger.debug("Selected " + results.Terms + " terms, " + results.Pointers + " pointers for this iteration");
        
        if (results.Terms == 0)
        {
          //logger.warn("No terms found this iteration - presuming end of iteration cycle (perhaps some lexicon terms are empty)");
          break;
        }
        i += processTerms;
        
        if (logger.isDebugEnabled())
          logger.debug("time to process part of lexicon: " + ((System.currentTimeMillis()- startProcessingLexicon) / 1000D));
        
        
        displayMemoryUsage(r);  
        
        //Scan the direct file looking for those terms
        startTraversingDirectFile = System.currentTimeMillis();
        traverseDirectFile(codesHashMap, tmpStorage);
        if (logger.isDebugEnabled())
          logger.debug("time to traverse direct file: " + ((System.currentTimeMillis() - startTraversingDirectFile) / 1000D));
        
        displayMemoryUsage(r);      
  
        //write the inverted file for this part of the lexicon, ie processTerms number of terms
        startWritingInvertedFile = System.currentTimeMillis();
        _numberOfTokens += writeInvertedFilePart(dos, tmpStorage, processTerms);
        if (logger.isDebugEnabled())
          logger.debug("time to write inverted file: "
           + ((System.currentTimeMillis()- startWritingInvertedFile) / 1000D));
        
              
        displayMemoryUsage(r);
  
        if (logger.isDebugEnabled()) {
          logger.debug(
              "time to perform one iteration: "
                + ((System.currentTimeMillis() - startProcessingLexicon)
                  / 1000D));
          logger.debug(
            "number of pointers processed: "
              + numberOfPointersThisIteration);  
        }
        
        
        tmpStorage  = null; 
        codesHashMap.clear(); 
        codesHashMap = null;
      }
      
      
      
      file.close();
      IndexUtil.close(lexiconStream);
      dos.close();
      
//      this.numberOfDocuments = _numberOfDocuments;
//      this.numberOfTokens = _numberOfTokens;
//      this.numberOfUniqueTerms = _numberOfUniqueTerms;
//      this.numberOfPointers = _numberOfPointers;


      //finalising the lexicon file with the updated information
      //on the frequencies and the offsets
      //reading the original lexicon
      lexiconStream = (Iterator<Map.Entry<String,LexiconEntry>>)index.getIndexStructureInputStream("lexicon");
      
      
      //the updated lexicon
      LexiconOutputStream<String> los = getLexOutputStream("tmplexicon");
      
      //the temporary data containing the offsets
      DataInputStream dis = new DataInputStream(Files.openFileStream(LexiconFilename.concat(".tmp2")));
      BitIndexPointer pin = new SimpleBitIndexPointer();
      while(lexiconStream.hasNext())
      {
        Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
        LexiconEntry value = lee.getValue();
        pin.readFields(dis);
        value.setPointer(pin);
        los.writeNextEntry(lee.getKey(), value);
      }
      IndexUtil.close(lexiconStream);
      los.close();
      dis.close();
      Files.delete(LexiconFilename.concat(".tmp2"));
      FSOMapFileLexicon.deleteMapFileLexicon("lexicon", index.getPath(), index.getPrefix());
      FSOMapFileLexicon.renameMapFileLexicon(
          "tmplexicon", index.getPath(), index.getPrefix(), 
          "lexicon", index.getPath(), index.getPrefix());
      
      index.addIndexStructure(
          structureName, 
          "org.terrier.structures.InvertedIndex", 
          "org.terrier.structures.Index,java.lang.String,org.terrier.structures.DocumentIndex,java.lang.Class", 
          "index,structureName,document,"+
            (FieldScore.FIELDS_COUNT > 0 ? FieldIterablePosting.class.getName() : BasicIterablePosting.class.getName() ));
      index.addIndexStructureInputStream(
          structureName,
                    "org.terrier.structures.InvertedIndexInputStream",
                    "org.terrier.structures.Index,java.lang.String,java.util.Iterator,java.lang.Class",
                    "index,structureName,lexicon-entry-inputstream,"+
                    (FieldScore.FIELDS_COUNT > 0 ? FieldIterablePosting.class.getName() : BasicIterablePosting.class.getName() ));
      index.setIndexProperty("index.inverted.fields.count", ""+FieldScore.FIELDS_COUNT );
      index.setIndexProperty("index.inverted.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ","));
      //should be already set, but in case their not
      index.setIndexProperty("num.Terms", ""+_numberOfUniqueTerms);
      index.setIndexProperty("num.Tokens", ""+_numberOfTokens);
      index.setIndexProperty("num.Pointers", ""+_numberOfPointers);
      index.flush();
      System.gc();
      
    } catch (IOException ioe) {
      logger.error("IOException occured during creating the inverted file. Stack trace follows.", ioe);
    }
  }
  
  protected TIntArrayList[] createPointerForTerm(LexiconEntry le)
  {
    TIntArrayList[] tmpArray = new TIntArrayList[2 + fieldCount];
    final int tmpNT = le.getDocumentFrequency();
    for(int i = 0; i < fieldCount+2; i++)
      tmpArray[i] = new TIntArrayList(tmpNT);
    return tmpArray;
  }
  
  /** Iterates through the lexicon, until it has reached the given number of pointers
    * @param PointersToProcess Number of pointers to stop reading the lexicon after
    * @param lexiconStream the lexicon input stream to read 
    * @param codesHashMap
    * @param tmpStorageStorage
    * @return IntLongTuple number of terms, number of pointers
    */
  protected IntLongTuple scanLexiconForPointers(
    final long PointersToProcess, 
    final Iterator<Map.Entry<String,LexiconEntry>> lexiconStream, 
    final TIntIntHashMap codesHashMap,
    final ArrayList<TIntArrayList[]> tmpStorageStorage)
    throws IOException
  {
    int _processTerms = 0;  
    long numberOfPointersThisIteration = 0;
    int j=0; //counter of loop iterations
    while(numberOfPointersThisIteration < PointersToProcess) {
    
      if (! lexiconStream.hasNext())
        break;
      
      Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
      LexiconEntry le = lee.getValue();
      
      _processTerms++;      
      numberOfPointersThisIteration += le.getDocumentFrequency();    
      tmpStorageStorage.add(createPointerForTerm(le));
      
      //the class TIntIntHashMap return zero when you look up for a
      //the value of a key that does not exist in the hash map.
      //For this reason, the values that will be inserted in the 
      //hash map are increased by one. 
      codesHashMap.put(le.getTermId(), j + 1);
      
      //increment counter
      j++;
    }
    if (logger.isDebugEnabled())
      logger.debug(
          numberOfPointersThisIteration + " pointers == "+
          _processTerms +" terms");
    return new IntLongTuple(_processTerms, numberOfPointersThisIteration);
  }
  
  
  /** Iterates through the lexicon, until it has reached the given number of terms
    * @param _processTerms Number of terms to stop reading the lexicon after
    * @param lexiconStream the lexicon input stream to read 
    * @param codesHashMap mapping of termids to which offset in the storage array for terms to be processed this iteration
    * @param tmpStorage place to put postings for this iteration
    * @return IntLongTuple number of terms, number of pointers
    */
  protected IntLongTuple scanLexiconForTerms(
    final int _processTerms, 
    final Iterator<Map.Entry<String,LexiconEntry>> lexiconStream, 
    final TIntIntHashMap codesHashMap,
    TIntArrayList[][] tmpStorage)
    throws IOException
  {
    int j = 0; 
    
    long numberOfPointersThisIteration = 0;
    for (; j < _processTerms; j++) {
    
      if (! lexiconStream.hasNext())
        break;
    
      Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
      LexiconEntry le = lee.getValue();
    
      TIntArrayList[] tmpArray = new TIntArrayList[2 + fieldCount];
      final int tmpNT = le.getDocumentFrequency();
      for (int i=0;i<2+fieldCount;i++)
      {
        tmpArray[i] = new TIntArrayList(tmpNT);
      }
      
      numberOfPointersThisIteration += tmpNT;
      
      tmpStorage[j] = tmpArray;
      
      
      //the class TIntIntHashMap return zero when you look up for a
      //the value of a key that does not exist in the hash map.
      //For this reason, the values that will be inserted in the 
      //hash map are increased by one. 
      codesHashMap.put(le.getTermId(), j + 1);
    }
    if (logger.isDebugEnabled())
      logger.debug(
        numberOfPointersThisIteration + " pointers == "+
        j +" terms");
    return new IntLongTuple(j, numberOfPointersThisIteration);
  }


  /**
   * Traverses the direct index and creates the inverted index entries 
   * for the terms specified in the codesHashMap and tmpStorage.
   * @param tmpStorage TIntArrayList[][] an array of the inverted index entries to store
   * @param codesHashMap a mapping from the term identifiers to the index 
   *    in the tmpStorage matrix. 
   * @throws IOException if there is a problem while traversing the direct index.
   */
   
  protected void traverseDirectFile(TIntIntHashMap codesHashMap, TIntArrayList[][] tmpStorage) 
    throws IOException 
  {
    //scan the direct file
    DirectIndexInputStream directInputStream = (DirectIndexInputStream)index.getIndexStructureInputStream("direct");
    int[][] documentTerms = null;
    int p = 0; //a document counter;
    final boolean _useFieldInformation = this.useFieldInformation;
    while ((documentTerms = directInputStream.getNextTerms())
      != null) {
      p += directInputStream.getEntriesSkipped();
      //the two next vectors are used for reducing the number of references
      final int[] documentTerms0 = documentTerms[0];
      final int[] termfreqs = documentTerms[1];
      //int[] htmlscores = null;
      //if (useFieldInformation)
      //  htmlscores = documentTerms[2];


      //scan the list of the j-th document's terms
      final int length = documentTerms0.length;
      
      for (int k = 0; k < length; k++) {
        //if the k-th term of the document is to be indexed in this pass
        int codePairIndex = codesHashMap.get(documentTerms0[k]);


        if (codePairIndex > 0) {
          /* need to decrease codePairIndex because it has been already 
           * increased while storing in codesHashMap */
          codePairIndex--;
          TIntArrayList[] tmpMatrix = tmpStorage[codePairIndex];


          tmpMatrix[0].add(p);
          tmpMatrix[1].add(termfreqs[k]);
          if (_useFieldInformation)
          {
            for(int fi = 0; fi < fieldCount; fi++)
              tmpMatrix[2+fi].add(documentTerms[fi+2][k]);
            //tmpMatrix[2].add(htmlscores[k]);
          }
        }
      }
      p++;
    }
    directInputStream.close();
  }
  
  /** Writes the section of the inverted file 
   * @param dos a temporary data structure that contains the offsets in the inverted
   *  index for each term.
   * @param tmpStorage Occurrences information, as described in traverseDirectFile().
   *  This data is consumed by this method - once this method has been called, all
   *  the data in tmpStorage will be destroyed.
   * @param _processTerms The number of terms being processed in this iteration.
   * @return the number of tokens processed in this iteration */
  protected long writeInvertedFilePart(
    final DataOutputStream dos, 
    TIntArrayList[][] tmpStorage, 
    final int _processTerms)
    throws IOException
  {
    BitIndexPointer p = new SimpleBitIndexPointer();
    //write to the inverted file. We should note that the lexicon 
    //should be updated with the start bit and byte offset for this
    //set of postings.
    int frequency; long numTokens = 0;
    for (int j = 0; j < _processTerms; j++) {


      
      frequency = 0; //the term frequency
      
      final int[][] tmpMatrix = new int[2+fieldCount][]; 
      for(int k=0;k<2+fieldCount;k++)
      {
        tmpMatrix[k] = tmpStorage[j][k].toNativeArray();
      }
      tmpStorage[j] = null;
      
      final int[] tmpMatrix0 = tmpMatrix[0];
      final int[] tmpMatrix1 = tmpMatrix[1];
      
      p.setOffset(file.getByteOffset(), file.getBitOffset());
      p.setNumberOfEntries(tmpMatrix0.length);
      p.write(dos);


      //THIS IS ALWAYS AN ERROR
      /*
      if (tmpMatrix[0].length == 0)
      {
        logger.error("Term had no postings - is this right?");
        //This term has no postings
        continue;
      }*/
      
      //write the first entry
      int docid;
      file.writeGamma((docid = tmpMatrix0[0]) + 1);
      int termfreq = tmpMatrix1[0];
      frequency += termfreq;
      file.writeUnary(termfreq);
      
      if (useFieldInformation)
      {
        for(int fi = 0; fi < fieldCount;fi++)
        {
          file.writeUnary(tmpMatrix[2+fi][0]+1);
        }
        for (int k = 1; k < tmpMatrix0.length; k++) {
          file.writeGamma(tmpMatrix0[k] - docid);
          docid = tmpMatrix0[k];
          termfreq   = tmpMatrix1[k];
          frequency += termfreq;
          file.writeUnary(termfreq);
          for(int fi = 0; fi < fieldCount;fi++)
          {
            file.writeUnary(tmpMatrix[2+fi][k]+1);
          }
        }
      }
      else
      {
        for (int k = 1; k < tmpMatrix0.length; k++) {
          file.writeGamma(tmpMatrix0[k] - docid);
          docid = tmpMatrix0[k];
          termfreq = tmpMatrix1[k];
          frequency += termfreq;
          file.writeUnary(termfreq);
        }
      }
      
      //long endOffset = file.getByteOffset();
      //byte endBitOffset = file.getBitOffset();
      //endBitOffset--;
      //if (endBitOffset < 0 && endOffset > 0) {
      //  endBitOffset = 7;
      //  endOffset--;
      //}
      numTokens += frequency;
      //dos.writeInt(frequency);
      
    }
    return numTokens;
  }
  
  /**
   * The number of terms for which the inverted file 
   * is built each time. The corresponding property
   * is <tt>invertedfile.processterms</tt> and the 
   * default value is <tt>75000</tt>. The higher the
   * value, the greater the requirements for memory are, 
   * but the less time it takes to invert the direct 
   * file. 
   * <p>createInvertedIndex() modifies this field: it is capped at the number of
   * unique terms, and after each lexicon scan it is set to the number of terms
   * selected for that iteration.
   */
  protected int processTerms = Integer.parseInt(ApplicationSetup.getProperty("invertedfile.processterms", "75000"));
  /**
   * display memory usage
   * @param r
   */
  public static void displayMemoryUsage(Runtime r)
  {
    if (logger.isDebugEnabled())
      logger.debug("free: "+ (r.freeMemory() /1024) + "kb; total: "+(r.totalMemory()/1024)
          +"kb; max: "+(r.maxMemory()/1024)+"kb; "+
          Rounding.toString((100.0d*r.freeMemory() / r.totalMemory()),1)+"% free; "+
          Rounding.toString((100.0d*r.totalMemory() / r.maxMemory()),1)+"% allocated; "
    );
  }




  /** 
   * get LexiconOutputStream
   * @param _structureName
   * @return LexiconOutputStream<String>
   * @throws IOException
   */
  @SuppressWarnings("unchecked")
  protected LexiconOutputStream<String> getLexOutputStream(String _structureName) throws IOException
  {
    return new FSOMapFileLexiconOutputStream(
        index.getPath(), index.getPrefix(), 
        _structureName, 
        (FixedSizeWriteableFactory<Text>)index.getIndexStructure("lexicon-keyfactory"));
  }


}
// Source Code of org.terrier.structures.indexing.InvertedIndexBuilder

// Related Classes of org.terrier.structures.indexing.InvertedIndexBuilder